{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collect data from Twitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import os\n",
    "import json\n",
    "import numpy as np\n",
    "import time\n",
    "import random\n",
    "from json.decoder import JSONDecodeError\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def truncated_exponential_backoff(attempt):\n",
    "    \"\"\"\n",
    "    Sleep using a truncated exponential backoff strategy.\n",
    "\n",
    "    attempt: number of consecutive retries so far\n",
    "    \"\"\"\n",
    "    max_attempt = 6\n",
    "    max_sleep = 64\n",
    "\n",
    "    attempt = min(attempt, max_attempt)\n",
    "    sleep_time = min((2 ** attempt) + random.random(), max_sleep)\n",
    "\n",
    "    time.sleep(sleep_time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing the keywords for collecting tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _read_keywords(path):\n",
    "    \"\"\"Return the non-blank lines of `path`, stripped of surrounding whitespace.\"\"\"\n",
    "    with open(path, encoding=\"utf-8\") as f:\n",
    "        # Blank lines are skipped: an empty keyword would end up as a bare '#'\n",
    "        # in the search query that is built from it.\n",
    "        return [line.strip() for line in f.read().splitlines() if line.strip()]\n",
    "\n",
    "\n",
    "# Each file holds one keyword per line.\n",
    "provax_keywords = _read_keywords('provax_keywords.txt')\n",
    "antivax_keywords = _read_keywords('antivax_keywords.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "viz_keywords = \"chart OR charts OR plot OR plots OR map OR maps OR dashboard OR dashboards OR vis OR viz OR visualization OR visualizations\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every term must be joined with an explicit OR: a bare space between two\n",
    "# terms is interpreted by the search API as AND.\n",
    "data_keywords = \"data OR stats OR statistics OR \" + viz_keywords\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collecting tweets that contain the keywords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bearer_oauth(r):\n",
    "    \"\"\"\n",
    "    Method required by bearer token authentication.\n",
    "    \"\"\"\n",
    "\n",
    "    r.headers[\"Authorization\"] = f\"Bearer {bearer_token}\"\n",
    "    r.headers[\"User-Agent\"] = \"v2FullArchiveSearchPython\"\n",
    "    return r\n",
    "\n",
    "\n",
    "def connect_to_endpoint(url, params):\n",
    "    response = requests.request(\"GET\", search_url, auth=bearer_oauth, params=params)\n",
    "    #print(response.status_code)\n",
    "    if response.status_code != 200:\n",
    "        raise Exception(response.status_code, response.text)\n",
    "    return response.json()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_tweets(\n",
    "    keywords,\n",
    "    query_params,\n",
    "    base_query,\n",
    "    search_url,\n",
    "    output_file,\n",
    "):\n",
    "    n_tweets = 0\n",
    "\n",
    "    for keyword in keywords:\n",
    "        n = 0\n",
    "        print(keyword)\n",
    "\n",
    "        query_params_current = query_params.copy()\n",
    "        while True:\n",
    "            try:\n",
    "                query_params_current[\"query\"] = base_query + \" #\" + keyword\n",
    "                new_data = connect_to_endpoint(search_url, query_params_current)\n",
    "                n_tweets = n_tweets + new_data[\"meta\"][\"result_count\"]\n",
    "\n",
    "                if \"data\" in new_data.keys():\n",
    "                    with open(output_file, \"r+\") as f:\n",
    "                        data = json.load(f)\n",
    "                        data = data + new_data[\"data\"]\n",
    "                        f.seek(0)\n",
    "                        json.dump(data, f)\n",
    "\n",
    "                    while \"next_token\" in new_data[\"meta\"].keys():\n",
    "                        query_params_current[\"next_token\"] = new_data[\"meta\"][\"next_token\"]\n",
    "                        while True:\n",
    "                            try:\n",
    "                                new_data = connect_to_endpoint(\n",
    "                                    search_url, query_params_current\n",
    "                                )\n",
    "                                n_tweets = n_tweets + new_data[\"meta\"][\"result_count\"]\n",
    "\n",
    "                                for tweet in new_data[\"data\"]:\n",
    "                                    tweet[\"keyword\"] = keyword\n",
    "\n",
    "                                with open(output_file, \"r+\") as f:\n",
    "                                    data = json.load(f)\n",
    "                                    data = data + new_data[\"data\"]\n",
    "                                    f.seek(0)\n",
    "                                    json.dump(data, f)\n",
    "                            except:\n",
    "                                truncExpBackOff(n)\n",
    "                                n = n + 1\n",
    "                                continue\n",
    "                            break\n",
    "\n",
    "            except:\n",
    "                # getting backoff from the website, execute the truncated exponential backoff algorithm\n",
    "                truncExpBackOff(n)\n",
    "                n = n + 1\n",
    "                continue\n",
    "            break\n",
    "\n",
    "    return n_tweets\n",
    "\n",
    "      "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The bearer token is taken from the environment, never hard-coded here.\n",
    "bearer_token = os.getenv(\"BEARER_TOKEN\")\n",
    "if bearer_token is None or bearer_token == \"\":\n",
    "    raise RuntimeError(\"Missing BEARER_TOKEN environment variable.\")\n",
    "\n",
    "# Endpoint and time window used for the search requests.\n",
    "SEARCH_URL = \"https://api.twitter.com/2/tweets/search/all\"\n",
    "START_TIME, END_TIME = \"2020-10-18T00:00:00Z\", \"2021-04-01T00:00:00Z\"\n",
    "\n",
    "# Query text shared by every request; data_keywords supplies the search terms.\n",
    "query = f\"-is:retweet lang:en ({data_keywords})\"\n",
    "base_query = query\n",
    "\n",
    "# Tweet attributes to request, sent as a single comma-separated value.\n",
    "TWEET_FIELDS = \",\".join((\n",
    "    \"attachments\",\n",
    "    \"author_id\",\n",
    "    \"context_annotations\",\n",
    "    \"created_at\",\n",
    "    \"entities\",\n",
    "    \"geo\",\n",
    "    \"id\",\n",
    "    \"in_reply_to_user_id\",\n",
    "    \"lang\",\n",
    "    \"possibly_sensitive\",\n",
    "    \"public_metrics\",\n",
    "    \"referenced_tweets\",\n",
    "    \"source\",\n",
    "    \"text\",\n",
    "    \"withheld\",\n",
    "))\n",
    "\n",
    "query_params = {\n",
    "    \"query\": base_query,\n",
    "    \"start_time\": START_TIME,\n",
    "    \"end_time\": END_TIME,\n",
    "    \"tweet.fields\": TWEET_FIELDS,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the tweets for the keywords in provax_keywords into pro_vax.json.\n",
    "n_tweets_provax = fetch_tweets(\n",
    "    keywords=provax_keywords,\n",
    "    query_params=query_params,\n",
    "    base_query=base_query,\n",
    "    search_url=SEARCH_URL,\n",
    "    output_file=\"pro_vax.json\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the tweets for the keywords in antivax_keywords into anti_vax.json.\n",
    "n_tweets_antivax = fetch_tweets(\n",
    "    keywords=antivax_keywords,\n",
    "    query_params=query_params,\n",
    "    base_query=base_query,\n",
    "    search_url=SEARCH_URL,\n",
    "    output_file=\"anti_vax.json\",\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
